Data is from https://gender-pay-gap.service.gov.uk/viewing/download
#install.packages("tidyverse")
#install.packages("janitor")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.2 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# here is how I would read the data files one at a time
gender_pay_2017_18 <- read_csv("data/UK Gender Pay Gap Data - 2017 to 2018.csv")
## Rows: 10219 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl (1): SubmittedAfterTheDeadline
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gender_pay_2018_19 <- read_csv("data/UK Gender Pay Gap Data - 2018 to 2019.csv")
## Rows: 10459 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl (1): SubmittedAfterTheDeadline
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# read in multiple files in one:
list_of_files <- list.files(path = "data", # specify the folder path
pattern = "\\.csv$", # only return files that end in csv
full.names = TRUE # foldeer path attached to the beginning of the file name
)
pay_combined <- read_csv(list_of_files, id = "file_name")
## Rows: 59315 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): EmployerName, Address, PostCode, CompanyNumber, SicCodes, CompanyL...
## dbl (15): EmployerId, DiffMeanHourlyPercent, DiffMedianHourlyPercent, DiffMe...
## lgl (1): SubmittedAfterTheDeadline
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pay_combined
pay_trimmed <- pay_combined %>%
clean_names() %>%
mutate(year_starting = as.numeric(str_extract(file_name, "[0-9]+")), .before = 1) %>% # take the first year listed in the file name
select(contains(c("year", "post", "percent", "quartile", "size"))) # selecting desired columns
pay_trimmed
scottish_postcodes <- c("AB", "DD", "PH", "FK", "G", "PA", "DG", "KA", "ML", "DG", "EH", "KY", "IV", "TD", "ZE", "HS")
Cut down the postcode column to just the first section and use this to determine if a company is in scotland or not:
pay_region <- pay_trimmed %>%
mutate(post_code = str_extract(post_code, "[A-Z0-9]+")) %>%
mutate(post_code = str_remove_all(post_code, "[0-9]+")) %>%
mutate(region = if_else(post_code %in% scottish_postcodes, "Scotland", "Rest of UK"), .after = post_code) %>%
select(-post_code)
pay_region
diff_mean_hourly_percent: mean % difference between male and female hourly pay (negative = women’s mean hourly pay is higher)
pay_region %>%
filter(region == "Scotland") %>%
ggplot() +
geom_boxplot(aes(x = year_starting, y = diff_mean_hourly_percent, group = year_starting), colour = "blue", alpha = 0.5) +
labs(
title = "Gender pay differences across Scottish companies",
subtitle = "2017 - 2023\n",
x = "\nFinancial year starting",
y = "Mean % difference in hourly pay\n(M - F)"
) +
scale_x_continuous(breaks = c(2017, 2018, 2019, 2020, 2021, 2022, 2023)) +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.line = element_line(colour = "black")) +
geom_hline(yintercept = 0, linetype = "dashed") +
coord_flip()
Let’s check out 2023
pay_region %>%
filter(region == "Scotland") %>%
group_by(year_starting) %>%
summarise(nrows = n())
male_bonus_percent: % of male employees paid a bonus
female_bonus_percent: % of female employees paid a bonus
pay_region %>%
filter(region == "Scotland") %>%
select(male_bonus_percent, female_bonus_percent) %>%
filter(male_bonus_percent != 0 & female_bonus_percent != 0) %>%
pivot_longer(cols = c("male_bonus_percent",
"female_bonus_percent"),
names_to = "gender",
values_to = "bonus_percent") %>%
ggplot() +
geom_boxplot(aes(x = gender, y = bonus_percent, colour = gender), linewidth = 1.5, show.legend = FALSE) +
labs(
title = "Bonuses paid across Scottish companies",
subtitle = "2017 - 2023\n",
x = "",
y = "% of employees paid a bonus"
) +
scale_x_discrete(labels = c("Women", "Men")) +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.line = element_line(colour = "black"))
Can see that there is very little difference in the bonuses paid to men and women across Scottish companies.
pay_region %>%
filter(region == "Scotland") %>%
mutate(employer_size = factor(employer_size, levels = c("Less than 250", "250 to 499", "500 to 999", "1000 to 4999", "5000 to 19,999", "20,000 or more", "Not Provided"))) %>%
filter(!employer_size %in% c("Not Provided", NA)) %>%
ggplot() +
geom_boxplot(aes(x = employer_size, y = diff_mean_hourly_percent), colour = "blue") +
labs(
title = "Gender pay differences across Scottish companies",
subtitle = "2017 - 2023\n",
x = "\nCompany size",
y = "Mean % difference in hourly pay\n(M - F)"
) +
theme_minimal() +
theme(panel.grid = element_blank(),
axis.line = element_line(colour = "black")) +
geom_hline(yintercept = 0, linetype = "dashed") +
coord_flip()
This plot appears to show that the larger a company is, the larger the pay gap between men and women (with men earning more per hour).
A line plot could be good for this